{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Glove to word vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "filename = 'glove.6B.100d.pkl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with open(filename, 'rb') as f:\n",
    "    glove = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "400000\n"
     ]
    }
   ],
   "source": [
    "print(len(glove.keys()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Reuters' IMDB dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from keras.datasets import reuters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading data from https://s3.amazonaws.com/text-datasets/reuters.npz\n",
      "1187840/2110848 [===============>..............] - ETA: 74s"
     ]
    }
   ],
   "source": [
    "(x_train, y_train), (x_test, y_test) = reuters.load_data(path=\"reuters.npz\",\n",
    "                                                      num_words=None,\n",
    "                                                      skip_top=0,\n",
    "                                                      maxlen=None,\n",
    "                                                      seed=113,\n",
    "                                                      start_char=1,\n",
    "                                                      oov_char=2,\n",
    "                                                      index_from=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "word_index = reuters.get_word_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "word_freq = sorted(word_index, key=word_index.get)\n",
    "idx2word = {word_index[w] : w for w in word_freq}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Peek the reuters newswire\n",
    "\n",
    "Note to subtract the index_from and start_char"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "' '.join([idx2word[i - 3] for i in x_train[0][1:]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set the vocab size to 5k most commonly used words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vocab_size = 5000\n",
    "\n",
    "train = [np.array([i - 3 if i <= vocab_size - 1 - 3 else vocab_size - 1 - 3 for i in cmt[1:]]) for cmt in x_train]\n",
    "test = [np.array([i - 3 if i <= vocab_size - 1 - 3 else vocab_size - 1 - 3 for i in cmt[1:]]) for cmt in x_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert orange is an amazing actor and now the same being director orange father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for orange and would recommend it to everyone to watch and the fly orange was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also orange to the two little orange that played the orange of norman and paul they were just brilliant children are often left out of the orange list i think because the stars that play them all grown up are such a big orange for the whole film but these children are amazing and should be orange for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was orange with us all\n",
      "--------------\n",
      "how his orange orange as both man and ape was outstanding not to mention the scenery of the film christopher orange was orange as lord of orange christopher is the soul to this masterpiece i became so with his performance i could feel my heart orange the of the movie still moves me to this day his portrayal of john was oscar worthy as he should have been nominated for it\n"
     ]
    }
   ],
   "source": [
    "print(' '.join([idx2word[i] for i in train[0]]))\n",
    "print('--------------')\n",
    "print(' '.join([idx2word[i] for i in test[0]]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Padding each review to 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "seq_len = 500\n",
    "\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "train = pad_sequences(train, maxlen=seq_len, value=0)\n",
    "test = pad_sequences(test, maxlen=seq_len, value=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(25000, 500)\n"
     ]
    }
   ],
   "source": [
    "print(train.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create embedding matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "embedding_matrix = np.zeros((vocab_size, 100))\n",
    "\n",
    "for i in range(1, vocab_size):\n",
    "    w = idx2word[i]\n",
    "    v = glove.get(w)\n",
    "    if v is not None:\n",
    "        embedding_matrix[i] = v\n",
    "    else:\n",
    "        embedding_matrix[i] = np.random.normal(scale=.6, size=(100,))\n",
    "embedding_matrix[-1] = np.random.normal(scale=.6, size=(100,))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Keras Embedding Layer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from keras.layers import Input, Flatten, Dropout, Dense, Embedding, Conv1D, MaxPooling1D, SpatialDropout1D, GlobalMaxPooling1D, Activation\n",
    "from keras.models import Model, Sequential\n",
    "from keras.layers.normalization import BatchNormalization\n",
    "from keras.optimizers import Adam\n",
    "from keras.layers.recurrent import LSTM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "embedding_layer = Embedding(\n",
    "    vocab_size,\n",
    "    100,\n",
    "    weights = [embedding_matrix],\n",
    "    input_length = seq_len,\n",
    "    trainable=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_7 (InputLayer)         (None, 500)               0         \n",
      "_________________________________________________________________\n",
      "embedding_1 (Embedding)      (None, 500, 100)          500000    \n",
      "_________________________________________________________________\n",
      "conv1d_13 (Conv1D)           (None, 496, 128)          64128     \n",
      "_________________________________________________________________\n",
      "max_pooling1d_10 (MaxPooling (None, 99, 128)           0         \n",
      "_________________________________________________________________\n",
      "conv1d_14 (Conv1D)           (None, 95, 128)           82048     \n",
      "_________________________________________________________________\n",
      "max_pooling1d_11 (MaxPooling (None, 19, 128)           0         \n",
      "_________________________________________________________________\n",
      "conv1d_15 (Conv1D)           (None, 15, 128)           82048     \n",
      "_________________________________________________________________\n",
      "global_max_pooling1d_4 (Glob (None, 128)               0         \n",
      "_________________________________________________________________\n",
      "dense_11 (Dense)             (None, 128)               16512     \n",
      "_________________________________________________________________\n",
      "dense_12 (Dense)             (None, 1)                 129       \n",
      "=================================================================\n",
      "Total params: 744,865\n",
      "Trainable params: 244,865\n",
      "Non-trainable params: 500,000\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "sequence_input = Input(shape=(seq_len,), dtype=np.int32)\n",
    "embedded_sequences = embedding_layer(sequence_input)\n",
    "x = Conv1D(128, 5, activation='relu')(embedded_sequences)\n",
    "x = MaxPooling1D(5)(x)\n",
    "x = Conv1D(128, 5, activation='relu')(x)\n",
    "x = MaxPooling1D(5)(x)\n",
    "x = Conv1D(128, 5, activation='relu')(x)\n",
    "x = GlobalMaxPooling1D()(x)  # global max pooling\n",
    "x = Dense(128, activation='relu')(x)\n",
    "y = Dense(46, activation='softmax')(x)\n",
    "\n",
    "model = Model(sequence_input, y)\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.compile(\n",
    "    loss='categorical_crossentropy',\n",
    "    optimizer='rmsprop',\n",
    "    metrics=['acc']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 25000 samples, validate on 25000 samples\n",
      "Epoch 1/3\n",
      "25000/25000 [==============================] - 15s - loss: 0.5182 - acc: 0.7304 - val_loss: 0.3715 - val_acc: 0.8364\n",
      "Epoch 2/3\n",
      "25000/25000 [==============================] - 15s - loss: 0.3748 - acc: 0.8383 - val_loss: 0.3407 - val_acc: 0.8552\n",
      "Epoch 3/3\n",
      "25000/25000 [==============================] - 15s - loss: 0.3300 - acc: 0.8634 - val_loss: 0.3304 - val_acc: 0.8617\n"
     ]
    }
   ],
   "source": [
    "history = model.fit(train, y_train, epochs=3, validation_data=(test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
